Topic Modelling¶
In [2]:
import pandas as pd
df_relevant_articles = pd.read_csv('ai_articles.csv')
In [3]:
df_relevant_articles.dropna(subset=['cleaned_text'], inplace=True)
In [4]:
df_relevant_articles
Out[4]:
| url | date | language | title | text | cleaned_text | tokens | |
|---|---|---|---|---|---|---|---|
| 0 | http://businessnewsthisweek.com/business/infog... | 2023-05-20 | en | Infogain AI Business Solutions Now Available i... | \n\nInfogain AI Business Solutions Now Availab... | infogain ai business solutions now available i... | ['infogain', 'ai', 'business', 'solution', 'av... |
| 1 | https://allafrica.com/stories/202504250184.html | 2025-04-25 | en | Africa: AI Policies in Africa - Lessons From G... | \nAfrica: AI Policies in Africa - Lessons From... | africa ai policies in africa lessons from gha... | ['africa', 'ai', 'policy', 'africa', 'lesson',... |
| 2 | https://asiatimes.com/2023/07/yang-lan-intervi... | 2023-07-25 | en | Yang Lan interviews academics on AI developmen... | \nYang Lan interviews academics on AI developm... | yang lan interviews academics on ai developmen... | ['yang', 'lan', 'interview', 'academic', 'ai',... |
| 3 | https://cdn.meritalk.com/articles/commerce-nom... | 2025-02-04 | en | Commerce Nominee Promises Increased Domestic A... | \nCommerce Nominee Promises Increased Domestic... | commerce nominee promises increased domestic a... | ['commerce', 'nominee', 'promise', 'increased'... |
| 4 | https://citylife.capetown/hmn/uncategorized/re... | 2023-11-11 | en | Revolutionizing the Manufacturing Industry: Th... | Revolutionizing the Manufacturing Industry:... | revolutionizing the manufacturing industry the... | ['revolutionizing', 'manufacturing', 'industry... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 157501 | https://www.wndu.com/prnewswire/2023/11/15/woo... | 2023-11-15 | en | Woolpert and Allvision Forge Strategic Agreeme... | Woolpert and Allvision Forge Strategic Agreeme... | woolpert and allvision forge strategic agreeme... | ['woolpert', 'allvision', 'forge', 'strategic'... |
| 157502 | https://www.wusf.org/2024-05-17/openais-new-ch... | 2024-05-17 | en | OpenAI's new ChatGPT talks and sings. But how ... | \nOpenAI's new ChatGPT talks and sings. But ho... | openais new chatgpt talks and sings but how hu... | ['openais', 'new', 'chatgpt', 'talk', 'sings',... |
| 157503 | https://www.wuwf.org/2024-06-03/all-eyes-on-ra... | 2024-06-03 | en | ‘All eyes on Rafah’ is the Internet's most vir... | \n‘All eyes on Rafah’ is the Internet's most v... | all eyes on rafah is the internets most viral ... | ['eye', 'rafah', 'internet', 'viral', 'ai', 't... |
| 157504 | https://www.zawya.com/en/press-release/compani... | 2024-04-15 | en | SentinelOne to spotlight Purple AI at GISEC 2024 | SentinelOne to spotlight Purple AI at GISEC 20... | sentinelone to spotlight purple ai at gisec go... | ['sentinelone', 'spotlight', 'purple', 'ai', '... |
| 157505 | https://www.zawya.com/en/press-release/governm... | 2024-06-13 | en | DGHR and DCAI join forces to support the world... | DGHR and DCAI join forces to support the world... | dghr and dcai join forces to support the world... | ['dghr', 'dcai', 'join', 'force', 'support', '... |
157506 rows × 7 columns
In [5]:
print(df_relevant_articles.shape)
(157506, 7)
In [10]:
# Imports
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd
import plotly.io as pio
# Load the full dataset
docs = df_relevant_articles['cleaned_text'].astype(str).tolist()
# Define the embedding model
embedding_model = SentenceTransformer("all-mpnet-base-v2")
# Custom BERTopic components
umap_model = UMAP(n_components=15, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=25, metric='euclidean', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=10)
ctfidf_model = ClassTfidfTransformer()
# Initialize BERTopic
topic_model = BERTopic(
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
top_n_words=15,
nr_topics="auto",
calculate_probabilities=True,
verbose=True
)
# Generate embeddings and fit model
embeddings = embedding_model.encode(docs, show_progress_bar=True)
topics, probs = topic_model.fit_transform(docs, embeddings)
# Assign topics back to original dataframe
df_relevant_articles['topic'] = topics
# Visualize topic summary
fig = topic_model.visualize_barchart(top_n_topics=15)
fig.show()
# View topic table
topic_model.get_topic_info().head(10)
Batches: 0%| | 0/4923 [00:00<?, ?it/s]
2025-05-26 17:55:10,325 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead. 2025-05-26 17:57:28,161 - BERTopic - Dimensionality - Completed ✓ 2025-05-26 17:57:28,165 - BERTopic - Cluster - Start clustering the reduced embeddings huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) 2025-05-26 20:42:23,669 - BERTopic - Cluster - Completed ✓ 2025-05-26 20:42:23,672 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction. 2025-05-26 20:44:35,862 - BERTopic - Representation - Completed ✓ 2025-05-26 20:44:36,055 - BERTopic - Topic reduction - Reducing number of topics 2025-05-26 20:44:37,480 - BERTopic - Representation - Fine-tuning topics using representation models. 2025-05-26 20:47:44,840 - BERTopic - Representation - Completed ✓ 2025-05-26 20:47:44,888 - BERTopic - Topic reduction - Reduced number of topics from 1047 to 171
Out[10]:
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 57406 | -1_ai_laptop_news_new | [ai, laptop, news, new, data, technology, medi... | [dataloop to supercharge adoption timeline for... |
| 1 | 0 | 90058 | 0_ai_news_new_technology | [ai, news, new, technology, data, share, busin... | [generative artificial intelligence research r... |
| 2 | 1 | 432 | 1_sports_nfl_wimbledon_ibm | [sports, nfl, wimbledon, ibm, players, fans, t... | [ibm brings generative ai commentary and ai dr... |
| 3 | 2 | 262 | 2_species_birds_conservation_bird | [species, birds, conservation, bird, animals, ... | [a new ai tool generates synthetic birdsongs t... |
| 4 | 3 | 231 | 3_beatles_mccartney_song_paul mccartney | [beatles, mccartney, song, paul mccartney, pau... | [the beatles are releasing their last record a... |
| 5 | 4 | 228 | 4_weather_forecasting_forecasts_climate | [weather, forecasting, forecasts, climate, pre... | [huaweis panguweather ai model can predict wea... |
| 6 | 5 | 218 | 5_wildfires_smoke_fires_cameras | [wildfires, smoke, fires, cameras, wildfire, f... | [the threat of wildfires is rising so is new a... |
| 7 | 6 | 216 | 6_ab_data_engineer_experience | [ab, data, engineer, experience, management, s... | [data scientist itonline home about us busine... |
| 8 | 7 | 168 | 7_data science_data_science_digi | [data science, data, science, digi, nv, data s... | [switching to data science career guide on tra... |
| 9 | 8 | 160 | 8_nsfw_ai chat_chat_girlfriend | [nsfw, ai chat, chat, girlfriend, characters, ... | [the best site for nsfw ai chat amp ai girlfri... |
In [12]:
topic_info = topic_model.get_topic_info()
print(topic_info)
Topic Count Name \
0 -1 57406 -1_ai_laptop_news_new
1 0 90058 0_ai_news_new_technology
2 1 432 1_sports_nfl_wimbledon_ibm
3 2 262 2_species_birds_conservation_bird
4 3 231 3_beatles_mccartney_song_paul mccartney
.. ... ... ...
166 165 26 165_musicians_album_protest_artists
167 166 26 166_rabbit_device_pocket_january
168 167 26 167_cardiac_iv_ai human_valve
169 168 26 168_fairy_circles_study_patterns
170 169 26 169_buffett_berkshire_ago_omaha
Representation \
0 [ai, laptop, news, new, data, technology, medi...
1 [ai, news, new, technology, data, share, busin...
2 [sports, nfl, wimbledon, ibm, players, fans, t...
3 [species, birds, conservation, bird, animals, ...
4 [beatles, mccartney, song, paul mccartney, pau...
.. ...
166 [musicians, album, protest, artists, bush, cre...
167 [rabbit, device, pocket, january, ces, ai pin,...
168 [cardiac, iv, ai human, valve, monitoring, hea...
169 [fairy, circles, study, patterns, computer vis...
170 [buffett, berkshire, ago, omaha, warren, warre...
Representative_Docs
0 [dataloop to supercharge adoption timeline for...
1 [generative artificial intelligence research r...
2 [ibm brings generative ai commentary and ai dr...
3 [a new ai tool generates synthetic birdsongs t...
4 [the beatles are releasing their last record a...
.. ...
166 [musicians release silent album to protest uk ...
167 [an indepth explanation of ces s ai sensation ...
168 [biotronik announced first implant of new impl...
169 [new discoveries of unexplained fairy circles ...
170 [buffett shares good news on profits ai though...
[171 rows x 5 columns]
In [13]:
# Count total topics (excluding -1)
num_topics = df_relevant_articles['topic'].nunique() - (1 if -1 in df_relevant_articles['topic'].values else 0)
print(f"Number of meaningful topics: {num_topics}")
Number of meaningful topics: 170
In [17]:
topic_model = topic_model.reduce_topics(docs, nr_topics=30)
2025-05-26 21:58:19,668 - BERTopic - Topic reduction - Reducing number of topics 2025-05-26 21:58:19,881 - BERTopic - Representation - Fine-tuning topics using representation models. 2025-05-26 22:03:30,470 - BERTopic - Representation - Completed ✓ 2025-05-26 22:03:30,505 - BERTopic - Topic reduction - Reduced number of topics from 171 to 30
In [20]:
df_relevant_articles['reduced_topic'] = topic_model.topics_
In [22]:
topic_info_reduced = topic_model.get_topic_info()
display(topic_info_reduced.head(10))
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 57406 | -1_ai_news_new_data | [ai, news, new, data, technology, laptop, busi... | [meet the gig workers making ai machines more ... |
| 1 | 0 | 90992 | 0_ai_news_new_data | [ai, news, new, data, technology, business, sh... | [new generative ai study highlights adoption u... |
| 2 | 1 | 1374 | 1_weather_news_ai_new | [weather, news, ai, new, public, help, ago, cl... | [the threat of wildfires is rising so are new ... |
| 3 | 2 | 846 | 2_ago_news_said_hours ago | [ago, news, said, hours ago, hours, public, pe... | [how europe is leading the world in building g... |
| 4 | 3 | 835 | 3_sports_nfl_players_game | [sports, nfl, players, game, ai, gt, games, pl... | [selflearning ai unveils nfl against the sprea... |
| 5 | 4 | 802 | 4_dental_health_ai_clinical | [dental, health, ai, clinical, patients, ivf, ... | [platinum dental services partners with overje... |
| 6 | 5 | 761 | 5_quantum_materials_nobel_ai | [quantum, materials, nobel, ai, physics, compu... | [nextgen superconducting diode enhancing ai pe... |
| 7 | 6 | 544 | 6_ai_church_victor_news | [ai, church, victor, news, human, religious, t... | [toleranceca most australians are worried abo... |
| 8 | 7 | 488 | 7_lg_tv_oled_holographic | [lg, tv, oled, holographic, yes, slide, produc... | [lg c k oled evo with thinq ai oledcpua lg ... |
| 9 | 8 | 436 | 8_data_data science_science_learning | [data, data science, science, learning, ab, da... | [how to get started in data science essential ... |
In [26]:
fig = topic_model.visualize_barchart(top_n_topics=30) # You can change 30 to 15 or 20 for simpler view
fig.show()
In [28]:
fig2 = topic_model.visualize_topics()
fig2.show()
In [30]:
topic_info_reduced = topic_model.get_topic_info()
display(topic_info_reduced.head(30))
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 57406 | -1_ai_news_new_data | [ai, news, new, data, technology, laptop, busi... | [meet the gig workers making ai machines more ... |
| 1 | 0 | 90992 | 0_ai_news_new_data | [ai, news, new, data, technology, business, sh... | [new generative ai study highlights adoption u... |
| 2 | 1 | 1374 | 1_weather_news_ai_new | [weather, news, ai, new, public, help, ago, cl... | [the threat of wildfires is rising so are new ... |
| 3 | 2 | 846 | 2_ago_news_said_hours ago | [ago, news, said, hours ago, hours, public, pe... | [how europe is leading the world in building g... |
| 4 | 3 | 835 | 3_sports_nfl_players_game | [sports, nfl, players, game, ai, gt, games, pl... | [selflearning ai unveils nfl against the sprea... |
| 5 | 4 | 802 | 4_dental_health_ai_clinical | [dental, health, ai, clinical, patients, ivf, ... | [platinum dental services partners with overje... |
| 6 | 5 | 761 | 5_quantum_materials_nobel_ai | [quantum, materials, nobel, ai, physics, compu... | [nextgen superconducting diode enhancing ai pe... |
| 7 | 6 | 544 | 6_ai_church_victor_news | [ai, church, victor, news, human, religious, t... | [toleranceca most australians are worried abo... |
| 8 | 7 | 488 | 7_lg_tv_oled_holographic | [lg, tv, oled, holographic, yes, slide, produc... | [lg c k oled evo with thinq ai oledcpua lg ... |
| 9 | 8 | 436 | 8_data_data science_science_learning | [data, data science, science, learning, ab, da... | [how to get started in data science essential ... |
| 10 | 9 | 410 | 9_fashion_ai_news_art | [fashion, ai, news, art, anime, new, images, a... | [artificial intelligence step out in ai stile... |
| 11 | 10 | 353 | 10_beatles_mccartney_paul mccartney_paul | [beatles, mccartney, paul mccartney, paul, son... | [the beatles are releasing their final record ... |
| 12 | 11 | 329 | 11_said_county_ago_news | [said, county, ago, news, child, hours ago, ho... | [opaque ai tool may flag parents with disabili... |
| 13 | 12 | 280 | 12_ago_news_film_voice | [ago, news, film, voice, writers, hollywood, h... | [could ai pen casablanca screenwriters take ai... |
| 14 | 13 | 268 | 13_shares_etf_ratings_llc | [shares, etf, ratings, llc, robotics, analysts... | [global x robotics artificial intelligence th... |
| 15 | 14 | 188 | 14_aa_update_suicide_doi | [aa, update, suicide, doi, machine, machine le... | [safetylit predicting lifetime suicide attempt... |
| 16 | 15 | 186 | 15_openai_altman_news_ago | [openai, altman, news, ago, sam, ceo, death, s... | [worldcoin scans eyeballs and offers crypto wh... |
| 17 | 16 | 170 | 16_vehicle_vehicles_trailer_chatgpt | [vehicle, vehicles, trailer, chatgpt, cars, fo... | [ford uses ai to make connecting a trailer as ... |
| 18 | 17 | 165 | 17_antarctic_waves_iceberg_rogue | [antarctic, waves, iceberg, rogue, mapping, oc... | [a new era of iceberg mapping how artificial i... |
| 19 | 18 | 155 | 18_league_star_win_premier league | [league, star, win, premier league, england, e... | [ja morants statement sounded like it was writ... |
| 20 | 19 | 142 | 19_republic_kingdom_email_peoples | [republic, kingdom, email, peoples, password, ... | [hilands convoy passes triway ai richlandsou... |
| 21 | 20 | 66 | 20_patent_inventor_court_roberts | [patent, inventor, court, roberts, law, patent... | [can an ai system be an inventor full court sa... |
| 22 | 21 | 66 | 21_stocks_tech stocks_companies_fool | [stocks, tech stocks, companies, fool, investi... | [artificial intelligence ai stocks that could ... |
| 23 | 22 | 54 | 22_dal_salvador_museum_king | [dal, salvador, museum, king, radio, npr, inst... | [an ai salvador dal will answer any question w... |
| 24 | 23 | 36 | 23_shah_rafah_creator_npr | [shah, rafah, creator, npr, instagram, image, ... | [all eyes on rafah is the internets most viral... |
| 25 | 24 | 35 | 24_modi_summit_france_macron | [modi, summit, france, macron, pm modi, india,... | [pm modi to cochair ai summit to open new cons... |
| 26 | 25 | 35 | 25_michael_family_interview_magazine | [michael, family, interview, magazine, german,... | [michael schumachers family plan legal action ... |
| 27 | 26 | 30 | 26_george_special_comedy_lawsuit | [george, special, comedy, lawsuit, estate, def... | [george carlin estate sues over fake ai comedy... |
| 28 | 27 | 28 | 27_tupac_drake_dead_song | [tupac, drake, dead, song, estate, songs, kend... | [it was a classic rap beef then drake revived ... |
| 29 | 28 | 26 | 28_fairy_circles_study_patterns | [fairy, circles, study, patterns, computer vis... | [new discoveries of unexplained fairy circles ... |
In [33]:
reduced_topic_to_industry = {
0: "General AI",
1: "Energy & Environment", # weather, public, climate
2: "Policy & Governance", # said, hours ago, public
3: "Sports & Events", # sports, nfl, players, game
4: "Healthcare", # dental, health, clinical
5: "Science & Research", # quantum, materials, nobel
6: "Ethics & Society", # church, human, religious
7: "Consumer Tech", # LG, TV, OLED, holographic
8: "Data Science & Education", # data science, learning
9: "Fashion & Art", # fashion, images, art
10: "Media & Entertainment", # beatles, paul, music
11: "Public Health", # county, child, care
12: "Film & Storytelling", # hollywood, film, voice
13: "Finance & Investment", # ETF, ratings, shares
14: "Mental Health & Safety", # suicide, update, prediction
15: "OpenAI & Leadership", # altman, openai, sam
16: "Auto & Mobility", # vehicle, trailer, cars
17: "Climate & Oceanography", # iceberg, waves, rogue
18: "Sports & Culture", # league, star, win
19: "Politics & Governance", # republic, kingdom, people
20: "Legal & IP", # patent, inventor, court
21: "Finance & Tech Stocks", # stocks, fool, companies
22: "Museums & Art", # museum, salvador dalí
23: "Digital Creators & Culture", # rafah, instagram, viral
24: "International Politics", # modi, summit, macron
25: "Media & Privacy", # family, interview, magazine
26: "Comedy & Legal Issues", # comedy, lawsuit, george
27: "Music & IP Rights", # tupac, song, estate
28: "Science & Discovery", # fairy circles, patterns
}
In [35]:
# Apply mapping to assign industries
df_relevant_articles['industry'] = df_relevant_articles['reduced_topic'].map(reduced_topic_to_industry)
In [37]:
industry_counts = df_relevant_articles['industry'].value_counts().reset_index()
industry_counts.columns = ['Industry', 'Article Count']
display(industry_counts)
| Industry | Article Count | |
|---|---|---|
| 0 | General AI | 90992 |
| 1 | Energy & Environment | 1374 |
| 2 | Policy & Governance | 846 |
| 3 | Sports & Events | 835 |
| 4 | Healthcare | 802 |
| 5 | Science & Research | 761 |
| 6 | Ethics & Society | 544 |
| 7 | Consumer Tech | 488 |
| 8 | Data Science & Education | 436 |
| 9 | Fashion & Art | 410 |
| 10 | Media & Entertainment | 353 |
| 11 | Public Health | 329 |
| 12 | Film & Storytelling | 280 |
| 13 | Finance & Investment | 268 |
| 14 | Mental Health & Safety | 188 |
| 15 | OpenAI & Leadership | 186 |
| 16 | Auto & Mobility | 170 |
| 17 | Climate & Oceanography | 165 |
| 18 | Sports & Culture | 155 |
| 19 | Politics & Governance | 142 |
| 20 | Legal & IP | 66 |
| 21 | Finance & Tech Stocks | 66 |
| 22 | Museums & Art | 54 |
| 23 | Digital Creators & Culture | 36 |
| 24 | International Politics | 35 |
| 25 | Media & Privacy | 35 |
| 26 | Comedy & Legal Issues | 30 |
| 27 | Music & IP Rights | 28 |
| 28 | Science & Discovery | 26 |
In [39]:
sample_articles = df_relevant_articles.groupby('industry').apply(
lambda x: x.sample(1, random_state=42)
)[['cleaned_text', 'industry', 'reduced_topic']].reset_index(drop=True)
display(sample_articles)
/var/folders/59/_xzwcqns6vj35vgt26hxm7qh0000gn/T/ipykernel_3581/2387598375.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
| cleaned_text | industry | reduced_topic | |
|---|---|---|---|
| 0 | gm explores using chatgpt in vehicles daily m... | Auto & Mobility | 16 |
| 1 | mampiasa ai sy fianarana milina ny mpahay sian... | Climate & Oceanography | 17 |
| 2 | george carlin estate sues over fake comedy spe... | Comedy & Legal Issues | 26 |
| 3 | patsnap to launch first aipowered gpt tool to ... | Consumer Tech | 7 |
| 4 | data science interview preparation course to... | Data Science & Education | 8 |
| 5 | all eyes on rafah is the internets most viral ... | Digital Creators & Culture | 23 |
| 6 | chinas use of ai to operate military satellite... | Energy & Environment | 1 |
| 7 | when it comes to religion artificial intellige... | Ethics & Society | 6 |
| 8 | hm to use ai to create model doppelgangers ama... | Fashion & Art | 9 |
| 9 | voice analysis shows striking similarity betwe... | Film & Storytelling | 12 |
| 10 | global x robotics artificial intelligence etf... | Finance & Investment | 13 |
| 11 | could this undervalued ai company be canadas n... | Finance & Tech Stocks | 21 |
| 12 | kinetica launches generative ai solution for r... | General AI | 0 |
| 13 | overjet partners with dental care alliance pro... | Healthcare | 4 |
| 14 | modi cochairs actionpacked ai summit in paris ... | International Politics | 24 |
| 15 | an indepth analysis of inventorship of ai and ... | Legal & IP | 20 |
| 16 | paul mccartney employs ai to help create the b... | Media & Entertainment | 10 |
| 17 | german publisher apologises for fake schumache... | Media & Privacy | 25 |
| 18 | safetylit road traffic injury prevention the r... | Mental Health & Safety | 14 |
| 19 | an ai salvador dal will answer any question wh... | Museums & Art | 22 |
| 20 | it was a classic rap beef then drake revived t... | Music & IP Rights | 27 |
| 21 | sam altman sam altman wont return as ceo of op... | OpenAI & Leadership | 15 |
| 22 | pakistan is stunned as party of imprisoned exp... | Policy & Governance | 2 |
| 23 | as social media guardrails fade and ai deepfak... | Politics & Governance | 19 |
| 24 | megan thee stallions ai sex tape reignites deb... | Public Health | 11 |
| 25 | the mystery of fairy circles expands artificia... | Science & Discovery | 28 |
| 26 | ai in chemicals market size projected to be wo... | Science & Research | 5 |
| 27 | man united tears slot glory everton progress ... | Sports & Culture | 18 |
| 28 | chatgpt ranks the secs best football uniforms ... | Sports & Events | 3 |
Custom Classification for General AI¶
In [42]:
industry_keywords = {
"Healthcare": ["healthcare", "clinical", "hospital", "patient", "biotech", "medical", "treatment", "disease"],
"Finance": ["bank", "fraud", "fintech", "credit", "investment", "insurance", "loan", "trading", "fund"],
"Retail & Consumer Goods": ["retail", "customer", "shopping", "store", "ecommerce", "product", "purchase", "brand"],
"Education": ["education", "student", "university", "classroom", "school", "curriculum", "learning"],
"Legal & IP": ["legal", "court", "law", "intellectual", "policy", "governance", "regulation", "rights"],
"Technology & Software": ["software", "platform", "tools", "apps", "development", "automation", "api"],
"Media & Entertainment": ["music", "film", "movie", "tv", "video", "beatles", "entertainment", "netflix"],
"Climate & Environment": ["climate", "environment", "carbon", "sustainability", "emissions", "renewable", "energy"],
"Social Media": ["instagram", "twitter", "facebook", "viral", "followers", "content", "platform"],
"Government & Politics": ["government", "policy", "election", "minister", "modi", "summit", "president"],
}
In [44]:
def reclassify_general_ai(text):
text = text.lower()
for industry, keywords in industry_keywords.items():
if any(keyword in text for keyword in keywords):
return industry
return "General AI"
In [46]:
# Only reclassify articles in topic 0 that are currently labeled General AI
mask = (df_relevant_articles['reduced_topic'] == 0) & (df_relevant_articles['industry'] == "General AI")
df_relevant_articles.loc[mask, 'industry'] = df_relevant_articles.loc[mask, 'cleaned_text'].apply(reclassify_general_ai)
In [48]:
industry_counts_updated = df_relevant_articles['industry'].value_counts().reset_index()
industry_counts_updated.columns = ['Industry', 'Article Count']
display(industry_counts_updated)
| Industry | Article Count | |
|---|---|---|
| 0 | Healthcare | 38598 |
| 1 | Finance | 32653 |
| 2 | Retail & Consumer Goods | 14929 |
| 3 | Education | 3411 |
| 4 | Legal & IP | 1794 |
| 5 | Energy & Environment | 1374 |
| 6 | Policy & Governance | 846 |
| 7 | Sports & Events | 835 |
| 8 | Science & Research | 761 |
| 9 | Ethics & Society | 544 |
| 10 | Consumer Tech | 488 |
| 11 | Media & Entertainment | 462 |
| 12 | Data Science & Education | 436 |
| 13 | Fashion & Art | 410 |
| 14 | Public Health | 329 |
| 15 | Technology & Software | 294 |
| 16 | Film & Storytelling | 280 |
| 17 | Finance & Investment | 268 |
| 18 | Mental Health & Safety | 188 |
| 19 | OpenAI & Leadership | 186 |
| 20 | Auto & Mobility | 170 |
| 21 | Climate & Oceanography | 165 |
| 22 | Sports & Culture | 155 |
| 23 | Politics & Governance | 142 |
| 24 | Finance & Tech Stocks | 66 |
| 25 | Museums & Art | 54 |
| 26 | Digital Creators & Culture | 36 |
| 27 | International Politics | 35 |
| 28 | Media & Privacy | 35 |
| 29 | Comedy & Legal Issues | 30 |
| 30 | General AI | 28 |
| 31 | Music & IP Rights | 28 |
| 32 | Science & Discovery | 26 |
| 33 | Social Media | 18 |
| 34 | Government & Politics | 14 |
| 35 | Climate & Environment | 12 |
In [62]:
final_industry_mapping = {
'Finance': 'Finance',
'Healthcare': 'Healthcare',
'Retail & Consumer Goods': 'Retail & Consumer Goods',
'Retail & Food': 'Retail & Consumer Goods',
'Education': 'Education',
'Education & History': 'Education',
'Legal': 'Legal',
'Media & Entertainment': 'Media & Entertainment',
'Marketing & Advertising': 'Marketing & Advertising',
'Media & Tech News': 'Media & Entertainment',
'Media & Policy': 'Media & Entertainment',
'Entertainment & Performing Arts': 'Media & Entertainment',
'Technology & Software': 'Technology & Software',
'Software Development': 'Technology & Software',
'AI Safety': 'Technology & Software',
'Energy & Environment': 'Energy & Environment',
'Climate Science': 'Energy & Environment',
'Space & Astronomy': 'Energy & Environment',
'Oceanography & Science': 'Energy & Environment',
'Social Media': 'Social Media',
'Social Media Tools': 'Social Media',
'Publishing & Journalism': 'Culture & Journalism',
'Museums & Culture': 'Culture & Journalism',
'Digital Legacy & Ethics': 'Ethics & Society',
'Surveillance & Location': 'Security & Surveillance',
'General AI': 'General AI',
'Elder Care': 'Healthcare',
'Sports & Events': 'Media & Entertainment',
'Science & Research': 'Science & Research', # or its own if needed
'Politics & Governance': 'Government & Politics',
'International Politics': 'Government & Politics',
'Legal & IP': 'Legal',
'Climate & Oceanography': 'Energy & Environment',
'Media & Privacy': 'Media & Entertainment',
'Digital Creators & Culture': 'Media & Entertainment',
'Fashion & Art': 'Media & Entertainment',
'OpenAI & Leadership': 'Technology & Software',
'Music & IP Rights': 'Media & Entertainment',
'Comedy & Legal Issues': 'Media & Entertainment',
'Mental Health & Safety': 'Healthcare',
'Public Health': 'Healthcare',
'Film & Storytelling': 'Media & Entertainment',
'Auto & Mobility': 'Technology & Software',
'Ethics & Society': 'Ethics & Society',
'Government & Politics': 'Government & Politics',
'Science & Discovery': 'Energy & Environment',
'Consumer Tech': 'Technology & Software',
'Data Science & Education': 'Education'
}
In [64]:
df_relevant_articles['final_industry'] = df_relevant_articles['industry'].map(final_industry_mapping)
In [66]:
final_industry_counts = df_relevant_articles['final_industry'].value_counts().reset_index()
final_industry_counts.columns = ['Final Industry', 'Total Articles']
display(final_industry_counts)
| Final Industry | Total Articles | |
|---|---|---|
| 0 | Healthcare | 39115 |
| 1 | Finance | 32653 |
| 2 | Retail & Consumer Goods | 14929 |
| 3 | Education | 3847 |
| 4 | Media & Entertainment | 2116 |
| 5 | Legal | 1794 |
| 6 | Energy & Environment | 1565 |
| 7 | Technology & Software | 1138 |
| 8 | Science & Research | 761 |
| 9 | Ethics & Society | 544 |
| 10 | Government & Politics | 191 |
| 11 | General AI | 28 |
| 12 | Social Media | 18 |
Plots¶
In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Make sure date is datetime type
df_relevant_articles['date'] = pd.to_datetime(df_relevant_articles['date'], errors='coerce')
# Drop rows without valid dates
df_time = df_relevant_articles.dropna(subset=['date'])
# Optional: Choose top N industries for clarity
top_industries = df_time['final_industry'].value_counts().head(6).index.tolist()
df_time = df_time[df_time['final_industry'].isin(top_industries)]
# Group by month and industry
df_time['month'] = df_time['date'].dt.to_period('M').dt.to_timestamp()
df_grouped = df_time.groupby(['month', 'final_industry']).size().reset_index(name='Article Count')
# Plot
plt.figure(figsize=(14, 7))
sns.lineplot(data=df_grouped, x='month', y='Article Count', hue='final_industry', marker='o')
plt.title("AI Article Volume Over Time by Industry", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Number of Articles")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
In [83]:
df_plot = final_industry_counts.sort_values(by='Total Articles', ascending=True)
# Bar chart
plt.figure(figsize=(12, 8))
sns.barplot(data=df_plot, x='Total Articles', y='Final Industry', palette='RdPu')
plt.title("AI Articles Across Industries", fontsize=16)
plt.xlabel("Number of Articles")
plt.ylabel("Industry")
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [85]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Sort values
df_plot_sorted = final_industry_counts.sort_values(by='Total Articles', ascending=False).reset_index(drop=True)
# Calculate total and percentage
total_articles = df_plot_sorted['Total Articles'].sum()
df_plot_sorted['percentage'] = df_plot_sorted['Total Articles'] / total_articles
# Threshold to group small slices
threshold = 0.01 # 1%
major_industries = df_plot_sorted[df_plot_sorted['percentage'] >= threshold]
others = df_plot_sorted[df_plot_sorted['percentage'] < threshold]
other_sum = others['Total Articles'].sum()
# Combine "Other" if needed
final_pie_data = major_industries[['Final Industry', 'Total Articles']].copy()
if other_sum > 0:
other_row = pd.DataFrame([{'Final Industry': 'Other', 'Total Articles': other_sum}])
final_pie_data = pd.concat([final_pie_data, other_row], ignore_index=True)
# Plot
plt.figure(figsize=(10, 10))
colors = sns.color_palette('RdPu', len(final_pie_data))
plt.pie(
final_pie_data['Total Articles'],
labels=final_pie_data['Final Industry'],
autopct='%1.1f%%',
startangle=140,
colors=colors,
wedgeprops={'edgecolor': 'white'},
textprops={'fontsize': 11}
)
plt.title("AI Article Distribution by Industry", fontsize=16)
plt.tight_layout()
plt.show()
Temporal Trends Analysis¶
In [90]:
# Ensure 'date' column is datetime
df_relevant_articles['date'] = pd.to_datetime(df_relevant_articles['date'], errors='coerce')
# Drop rows with missing dates
df_time = df_relevant_articles.dropna(subset=['date']).copy()
# Add 'month' column (truncate to month)
df_time['month'] = df_time['date'].dt.to_period('M').dt.to_timestamp()
In [96]:
import seaborn as sns
import matplotlib.pyplot as plt
# Group by month and industry (no filtering now)
monthly_counts_all = df_time.groupby(['month', 'final_industry']).size().reset_index(name='Article Count')
# Plot all industries
plt.figure(figsize=(16, 8))
sns.lineplot(data=monthly_counts_all, x='month', y='Article Count', hue='final_industry', marker='o')
plt.title('Monthly AI Article Volume Across All Industries', fontsize=16)
plt.xlabel('Month')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.legend(title='Industry', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [94]:
# Pivot data for stacked area chart
pivot_area = df_top.groupby(['month', 'final_industry']).size().unstack(fill_value=0)
# Plot
pivot_area.plot(kind='area', figsize=(14, 6), cmap='Set2', stacked=True)
plt.title('Share of AI Article Coverage Over Time by Industry')
plt.xlabel('Month')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.legend(title='Industry', bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [105]:
import seaborn as sns
import matplotlib.pyplot as plt
# Convert to proper string format for display
heatmap_data.columns = heatmap_data.columns.strftime('%Y-%m') # Clean month labels
# Plot
plt.figure(figsize=(16, 8))
sns.heatmap(
heatmap_data,
cmap="YlGnBu",
linewidths=0.3,
linecolor='white',
cbar_kws={'label': 'Number of Articles'}
)
plt.title("📅 AI Article Volume Heatmap (Industry vs. Month)", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Industry")
plt.xticks(rotation=45, ha='right') # Tilt for readability
plt.tight_layout()
plt.show()
In [111]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Combine all cleaned_texts by industry
industry_texts = df_relevant_articles.groupby('final_industry')['cleaned_text'].apply(lambda x: ' '.join(x)).reset_index()
# Plot word clouds for top 6 industries
top_inds = df_relevant_articles['final_industry'].value_counts().head(6).index.tolist()
industry_texts = industry_texts[industry_texts['final_industry'].isin(top_inds)]
# Generate and plot word clouds
for _, row in industry_texts.iterrows():
wc = WordCloud(width=800, height=400, background_color='white').generate(row['cleaned_text'])
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title(f"Word Cloud: {row['final_industry']}", fontsize=16)
plt.tight_layout()
plt.show()
In [112]:
# This shows similarity between topics (c-TF-IDF vectors)
fig = topic_model.visualize_heatmap(top_n_topics=30)
fig.show()
In [115]:
# Shows semantic relationships between topics in 2D space
fig = topic_model.visualize_topics()
fig.show()
In [135]:
# Save classified articles to CSV
df_relevant_articles.to_csv("ai_articles_by_industry.csv", index=False)